import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id
# Assignment
# Fix NumPy's RNG seed and select Plotly's "notebook" renderer.
np.random.seed(2)
pio.renderers.default = "notebook"

# Start (or reuse) the Spark session used by the rest of the analysis.
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Read the job-postings CSV with a header row, inferred column types,
# multi-line records, and '"' as the escape character.
csv_options = {
    "header": "true",
    "inferSchema": "true",
    "multiLine": "true",
    "escape": "\"",
}
df = spark.read.options(**csv_options).csv("data/lightcast_job_postings.csv")

# Register the DataFrame as the temp view "job_postings".
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")
#df.printSchema() # comment this line when rendering the submission
df.show(5)[Stage 14:> (0 + 1) / 1]
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
| ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES| POSTED| EXPIRED|DURATION| SOURCE_TYPES| SOURCES| URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO| TITLE_RAW| BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY| COMPANY_NAME|COMPANY_RAW|COMPANY_IS_STAFFING|EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP|SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM| LOCATION| CITY| CITY_NAME|COUNTY| COUNTY_NAME| MSA| MSA_NAME|STATE|STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING| MSA_NAME_OUTGOING|MSA_INCOMING| MSA_NAME_INCOMING|NAICS2| NAICS2_NAME|NAICS3| NAICS3_NAME|NAICS4| NAICS4_NAME|NAICS5| NAICS5_NAME|NAICS6| NAICS6_NAME| TITLE| TITLE_NAME| TITLE_CLEAN| SKILLS| SKILLS_NAME| SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME| CERTIFICATIONS| CERTIFICATIONS_NAME| COMMON_SKILLS| COMMON_SKILLS_NAME| SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME| ONET| ONET_NAME| ONET_2019| ONET_2019_NAME| CIP6| CIP6_NAME| CIP4| CIP4_NAME| CIP2| CIP2_NAME|SOC_2021_2| SOC_2021_2_NAME|SOC_2021_3| SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME| SOC_2| SOC_2_NAME| SOC_3| SOC_3_NAME| SOC_4| SOC_4_NAME| SOC_5| SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2| NAICS_2022_2_NAME|NAICS_2022_3| NAICS_2022_3_NAME|NAICS_2022_4| NAICS_2022_4_NAME|NAICS_2022_5| NAICS_2022_5_NAME|NAICS_2022_6| NAICS_2022_6_NAME|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
|1f57d95acf4dc67ed...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024| 6/8/2024| 6| [\n "Company"\n]|[\n "brassring.c...|[\n "https://sjo...| []| NULL|Enterprise Analys...|31-May-2024\n\nEn...| 6/8/2024| 6| 894731| Murphy USA| Murphy USA| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 2| 2| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.20...|RWwgRG9yYWRvLCBBUg==|El Dorado, AR| 5139| Union, AR|20980| El Dorado, AR| 5| Arkansas| 5139| Union, AR| 5139| Union, AR| 20980| El Dorado, AR| 20980| El Dorado, AR| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4|Enterprise Analysts|enterprise analys...|[\n "KS126DB6T06...|[\n "Merchandisi...|[\n "KS126DB6T06...| [\n "Merchandisi...| []| []|[\n "KS126706DPF...|[\n "Mathematics...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "45.0601",\n...|[\n "Economics, ...|[\n "45.06",\n ...|[\n "Economics",...|[\n "45",\n "27...|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...| 441330|Automotive Parts ...|
|0cb072af26757b6c4...| 8/2/2024| 2024-08-02 17:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Job Board"\n]| [\n "maine.gov"\n]|[\n "https://job...| []| NULL|Oracle Consultant...|Oracle Consultant...| 8/1/2024| NULL| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| 3| false| NULL| 1| Remote| NULL| NULL| NULL|{\n "lat": 44.31...| QXVndXN0YSwgTUU=| Augusta, ME| 23011| Kennebec, ME|12300|Augusta-Watervill...| 23| Maine| 23011| Kennebec, ME| 23011| Kennebec, ME| 12300|Augusta-Watervill...| 12300|Augusta-Watervill...| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...|
|85318b12b3331fa49...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL| Data Analyst|Taking care of pe...| 6/10/2024| 8|39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [\n "Bachelor's ...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 5| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 32.77...| RGFsbGFzLCBUWA==| Dallas, TX| 48113| Dallas, TX|19100|Dallas-Fort Worth...| 48| Texas| 48113| Dallas, TX| 48113| Dallas, TX| 19100|Dallas-Fort Worth...| 19100|Dallas-Fort Worth...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting|
|1b5c3941e54a1889e...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024|7/20/2024| 48| [\n "Job Board"\n]|[\n "disabledper...|[\n "https://www...| []| NULL|Sr. Lead Data Mgm...|About this role:\...| 6/12/2024| 10|37615159| Wells Fargo|Wells Fargo| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3| NULL| false| NULL| 0| [None]| NULL| NULL| NULL|{\n "lat": 33.44...| UGhvZW5peCwgQVo=| Phoenix, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking|522110| Commercial Banking|ET2114E0404BA30075|Management Analysts|sr lead data mgmt...|[\n "KS123QX62QY...|[\n "Exit Strate...|[\n "KS123QX62QY...| [\n "Exit Strate...| []| []|[\n "KS7G6NP6R6L...|[\n "Reliability...|[\n "KS4409D76NW...|[\n "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking| 522110| Commercial Banking|
|cb5ca25f02bdf25c1...| 6/19/2024| 2024-06-19 07:00:00| 0|6/2/2024|6/17/2024| 15|[\n "FreeJobBoar...|[\n "craigslist....|[\n "https://mod...| []| NULL|Comisiones de $10...|Comisiones de $10...| 6/17/2024| 15| 0| Unclassified| LH/GM| false| [\n 99\n]| [\n "No Educatio...| 99|No Education Listed| NULL| NULL| 3|Part-time / full-...| NULL| NULL| false| 92500| 0| [None]| year| 150000| 35000|{\n "lat": 37.63...| TW9kZXN0bywgQ0E=| Modesto, CA| 6099|Stanislaus, CA|33700| Modesto, CA| 6|California| 6099| Stanislaus, CA| 6099| Stanislaus, CA| 33700| Modesto, CA| 33700| Modesto, CA| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000| Unclassified|comisiones de por...| []| []| []| []| []| []| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-----------------------+-------+---------------
-----+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+
only showing top 5 rows
# Cast the salary and experience columns to float, one column at a time.
numeric_columns = (
    "SALARY_FROM",
    "SALARY_TO",
    "SALARY",
    "MIN_YEARS_EXPERIENCE",
    "MAX_YEARS_EXPERIENCE",
)
for column_name in numeric_columns:
    df = df.withColumn(column_name, col(column_name).cast("float"))
# define a method
def compute_median(sdf, col_name, rel_error=0.01):
    """Return the approximate median of a column of a Spark DataFrame.

    Args:
        sdf: Object exposing ``approxQuantile(col, probabilities, relativeError)``
            (here, a Spark DataFrame).
        col_name: Name of the column whose median is wanted.
        rel_error: Relative error passed to ``approxQuantile``. Defaults to
            0.01, the value this analysis has always used.

    Returns:
        The first (only) quantile returned for probability 0.5, or ``None``
        when ``approxQuantile`` returns an empty result.
    """
    quantiles = sdf.approxQuantile(col_name, [0.5], rel_error)
    return quantiles[0] if quantiles else None


# Medians of the three salary columns, used below to impute missing values.
median_from = compute_median(df, "SALARY_FROM")
median_to = compute_median(df, "SALARY_TO")
median_salary = compute_median(df, "SALARY")
print("medians: ", median_from, median_to, median_salary)[Stage 16:> (0 + 1) / 1] [Stage 17:> (0 + 1) / 1] [Stage 18:> (0 + 1) / 1]
medians: 87295.0 130042.0 115024.0
# Impute missing salary fields with their column medians. A median of None
# (compute_median returns None when approxQuantile yields nothing) is left
# out, since None is not a usable replacement value for fillna.
salary_fill_values = {
    "SALARY_FROM": median_from,
    "SALARY_TO": median_to,
    "SALARY": median_salary,
}
salary_fill_values = {
    name: value for name, value in salary_fill_values.items() if value is not None
}
if salary_fill_values:
    df = df.fillna(salary_fill_values)

# Midpoint of the salary range, computed after imputation.
df = df.withColumn("Average Salary", (col("SALARY_FROM") + col("SALARY_TO")) / 2)

print(df.columns)
# Captured output of print(df.columns):
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 
'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME', 'Average Salary']
from pyspark.sql.functions import regexp_replace, col
# Remove embedded line-feed / carriage-return characters from the
# education-level labels.
education_levels_flat = regexp_replace(col("EDUCATION_LEVELS_NAME"), "[\n\r]", "")
df = df.withColumn("EDUCATION_LEVELS_NAME", education_levels_flat)
#parse
# Columns carried over to pandas for export and plotting.
export_cols = [
    "EDUCATION_LEVELS_NAME",
    "REMOTE_TYPE_NAME",
    "MAX_YEARS_EXPERIENCE",
    "Average Salary",
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME",
]
df_selected = df.select(export_cols)
df_selected.show(10)+---------------------+----------------+--------------------+--------------+----------------------------------+
|EDUCATION_LEVELS_NAME|REMOTE_TYPE_NAME|MAX_YEARS_EXPERIENCE|Average Salary|LOT_V6_SPECIALIZED_OCCUPATION_NAME|
+---------------------+----------------+--------------------+--------------+----------------------------------+
| [ "Bachelor's de...| [None]| 2.0| 108668.5| General ERP Analy...|
| [ "No Education ...| Remote| 3.0| 108668.5| Oracle Consultant...|
| [ "Bachelor's de...| [None]| NULL| 108668.5| Data Analyst|
| [ "No Education ...| [None]| NULL| 108668.5| Data Analyst|
| [ "No Education ...| [None]| NULL| 92500.0| Oracle Consultant...|
| [ "Bachelor's de...| Remote| NULL| 110155.0| Data Analyst|
| [ "Bachelor's de...| [None]| NULL| 108668.5| Data Analyst|
| [ "Bachelor's de...| [None]| NULL| 108668.5| Data Analyst|
| [ "No Education ...| [None]| 7.0| 108668.5| General ERP Analy...|
| [ "Bachelor's de...| [None]| 2.0| 92962.0| Data Analyst|
+---------------------+----------------+--------------------+--------------+----------------------------------+
only showing top 10 rows
# Collect the selected columns into a pandas DataFrame and write them to CSV
# without the index column.
# NOTE(review): the output file is named "lighthouse" while the input data set
# is "lightcast" — confirm the intended file name.
pdf = df_selected.toPandas()
pdf.to_csv("./data/lighthouse_cleaned.csv", index=False)
print(len(pdf))[Stage 20:> (0 + 1) / 1]
72498
# Median "Average Salary" per specialized occupation, then the occupation
# names sorted from highest to lowest median.
median_salaries = pdf.groupby("LOT_V6_SPECIALIZED_OCCUPATION_NAME")["Average Salary"].median()
sorted_employment_types = median_salaries.sort_values(ascending=False).index

# Turn the occupation column into an ordered categorical using that order.
# NOTE(review): Plotly Express documents `category_orders` for controlling
# axis order — confirm the categorical dtype alone yields the intended order.
pdf["LOT_V6_SPECIALIZED_OCCUPATION_NAME"] = pd.Categorical(
    pdf["LOT_V6_SPECIALIZED_OCCUPATION_NAME"],
    categories=sorted_employment_types,
    ordered=True,
)

# Box plot of the salary distribution for each occupation.
fig = px.box(
    pdf,
    x="LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    y="Average Salary",
)
fig.show()

print(df.columns)
# Captured output of print(df.columns):
['ID', 'LAST_UPDATED_DATE', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES', 'POSTED', 'EXPIRED', 'DURATION', 'SOURCE_TYPES', 'SOURCES', 'URL', 'ACTIVE_URLS', 'ACTIVE_SOURCES_INFO', 'TITLE_RAW', 'BODY', 'MODELED_EXPIRED', 'MODELED_DURATION', 'COMPANY', 'COMPANY_NAME', 'COMPANY_RAW', 'COMPANY_IS_STAFFING', 'EDUCATION_LEVELS', 'EDUCATION_LEVELS_NAME', 'MIN_EDULEVELS', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS', 'MAX_EDULEVELS_NAME', 'EMPLOYMENT_TYPE', 'EMPLOYMENT_TYPE_NAME', 'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'IS_INTERNSHIP', 'SALARY', 'REMOTE_TYPE', 'REMOTE_TYPE_NAME', 'ORIGINAL_PAY_PERIOD', 'SALARY_TO', 'SALARY_FROM', 'LOCATION', 'CITY', 'CITY_NAME', 'COUNTY', 'COUNTY_NAME', 'MSA', 'MSA_NAME', 'STATE', 'STATE_NAME', 'COUNTY_OUTGOING', 'COUNTY_NAME_OUTGOING', 'COUNTY_INCOMING', 'COUNTY_NAME_INCOMING', 'MSA_OUTGOING', 'MSA_NAME_OUTGOING', 'MSA_INCOMING', 'MSA_NAME_INCOMING', 'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME', 'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME', 'TITLE', 'TITLE_NAME', 'TITLE_CLEAN', 'SKILLS', 'SKILLS_NAME', 'SPECIALIZED_SKILLS', 'SPECIALIZED_SKILLS_NAME', 'CERTIFICATIONS', 'CERTIFICATIONS_NAME', 'COMMON_SKILLS', 'COMMON_SKILLS_NAME', 'SOFTWARE_SKILLS', 'SOFTWARE_SKILLS_NAME', 'ONET', 'ONET_NAME', 'ONET_2019', 'ONET_2019_NAME', 'CIP6', 'CIP6_NAME', 'CIP4', 'CIP4_NAME', 'CIP2', 'CIP2_NAME', 'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME', 'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2021_5', 'SOC_2021_5_NAME', 'LOT_CAREER_AREA', 'LOT_CAREER_AREA_NAME', 'LOT_OCCUPATION', 'LOT_OCCUPATION_NAME', 'LOT_SPECIALIZED_OCCUPATION', 'LOT_SPECIALIZED_OCCUPATION_NAME', 'LOT_OCCUPATION_GROUP', 'LOT_OCCUPATION_GROUP_NAME', 'LOT_V6_SPECIALIZED_OCCUPATION', 'LOT_V6_SPECIALIZED_OCCUPATION_NAME', 'LOT_V6_OCCUPATION', 'LOT_V6_OCCUPATION_NAME', 'LOT_V6_OCCUPATION_GROUP', 'LOT_V6_OCCUPATION_GROUP_NAME', 'LOT_V6_CAREER_AREA', 'LOT_V6_CAREER_AREA_NAME', 'SOC_2', 'SOC_2_NAME', 'SOC_3', 'SOC_3_NAME', 
'SOC_4', 'SOC_4_NAME', 'SOC_5', 'SOC_5_NAME', 'LIGHTCAST_SECTORS', 'LIGHTCAST_SECTORS_NAME', 'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME', 'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME', 'NAICS_2022_6', 'NAICS_2022_6_NAME', 'Average Salary']
from pyspark.sql.functions import lit

# Constant 1 per row; summing it per group later gives the posting count.
df = df.withColumn("counter", lit(1))
#parse
# Columns needed for the occupation-level salary aggregation.
export_cols2 = [
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    "Average Salary",
    "counter",
]
df_selected2 = df.select(export_cols2)
df_selected2.show(40)+----------------------------------+--------------+-------+
|LOT_V6_SPECIALIZED_OCCUPATION_NAME|Average Salary|counter|
+----------------------------------+--------------+-------+
| General ERP Analy...| 108668.5| 1|
| Oracle Consultant...| 108668.5| 1|
| Data Analyst| 108668.5| 1|
| Data Analyst| 108668.5| 1|
| Oracle Consultant...| 92500.0| 1|
| Data Analyst| 110155.0| 1|
| Data Analyst| 108668.5| 1|
| Data Analyst| 108668.5| 1|
| General ERP Analy...| 108668.5| 1|
| Data Analyst| 92962.0| 1|
| Data Analyst| 107645.5| 1|
| Data Analyst| 108668.5| 1|
| Data Analyst| 108668.5| 1|
| General ERP Analy...| 192800.0| 1|
| Enterprise Architect| 81286.0| 1|
| Data Analyst| 108668.5| 1|
| General ERP Analy...| 125900.0| 1|
| Oracle Consultant...| 108668.5| 1|
| Enterprise Architect| 165000.0| 1|
| Data Analyst| 170000.0| 1|
| Data Analyst| 110155.0| 1|
| Enterprise Architect| 136950.0| 1|
| Data Analyst| 118560.0| 1|
| Enterprise Architect| 108668.5| 1|
| Business Analyst ...| 108668.5| 1|
| Data Analyst| 108668.5| 1|
| Enterprise Architect| 79000.0| 1|
| SAP Analyst / Admin| 41600.0| 1|
| Business Intellig...| 108668.5| 1|
| Data Analyst| 108668.5| 1|
| Data Analyst| 140756.5| 1|
| General ERP Analy...| 192800.0| 1|
| Oracle Consultant...| 75026.0| 1|
| General ERP Analy...| 116500.0| 1|
| Oracle Consultant...| 166500.0| 1|
| Oracle Consultant...| 108668.5| 1|
| Business Analyst ...| 108668.5| 1|
| Data Analyst| 42500.0| 1|
| Data Analyst| 156038.5| 1|
| Data Analyst| 108668.5| 1|
+----------------------------------+--------------+-------+
only showing top 40 rows
# Collect the occupation / salary / counter columns into a pandas DataFrame.
pdf2 = df_selected2.toPandas()
#pdf2.to_csv("./data/lighthouse_cleaned.csv", index=False)
#print(len(pdf2))
pdf2.head(30)[Stage 22:> (0 + 1) / 1]
| | LOT_V6_SPECIALIZED_OCCUPATION_NAME | Average Salary | counter |
|---|---|---|---|
| 0 | General ERP Analyst / Consultant | 108668.5 | 1 |
| 1 | Oracle Consultant / Analyst | 108668.5 | 1 |
| 2 | Data Analyst | 108668.5 | 1 |
| 3 | Data Analyst | 108668.5 | 1 |
| 4 | Oracle Consultant / Analyst | 92500.0 | 1 |
| 5 | Data Analyst | 110155.0 | 1 |
| 6 | Data Analyst | 108668.5 | 1 |
| 7 | Data Analyst | 108668.5 | 1 |
| 8 | General ERP Analyst / Consultant | 108668.5 | 1 |
| 9 | Data Analyst | 92962.0 | 1 |
| 10 | Data Analyst | 107645.5 | 1 |
| 11 | Data Analyst | 108668.5 | 1 |
| 12 | Data Analyst | 108668.5 | 1 |
| 13 | General ERP Analyst / Consultant | 192800.0 | 1 |
| 14 | Enterprise Architect | 81286.0 | 1 |
| 15 | Data Analyst | 108668.5 | 1 |
| 16 | General ERP Analyst / Consultant | 125900.0 | 1 |
| 17 | Oracle Consultant / Analyst | 108668.5 | 1 |
| 18 | Enterprise Architect | 165000.0 | 1 |
| 19 | Data Analyst | 170000.0 | 1 |
| 20 | Data Analyst | 110155.0 | 1 |
| 21 | Enterprise Architect | 136950.0 | 1 |
| 22 | Data Analyst | 118560.0 | 1 |
| 23 | Enterprise Architect | 108668.5 | 1 |
| 24 | Business Analyst (General) | 108668.5 | 1 |
| 25 | Data Analyst | 108668.5 | 1 |
| 26 | Enterprise Architect | 79000.0 | 1 |
| 27 | SAP Analyst / Admin | 41600.0 | 1 |
| 28 | Business Intelligence Analyst | 108668.5 | 1 |
| 29 | Data Analyst | 108668.5 | 1 |
# Per occupation: median of "Average Salary" and the sum of the per-row
# counter, with the occupation name restored as a regular column.
occupation_aggregations = {
    "Average Salary": "median",
    "counter": "sum",
}
postings_by_occupation = pdf2.groupby("LOT_V6_SPECIALIZED_OCCUPATION_NAME")
median_salaries2 = postings_by_occupation.agg(occupation_aggregations).reset_index()
#sorted_employment_types2 = median_salaries2.sort_values(ascending=False).index
median_salaries2.head()| LOT_V6_SPECIALIZED_OCCUPATION_NAME | Average Salary | counter | |
|---|---|---|---|
| 0 | Business Analyst (General) | 108668.5 | 4326 |
| 1 | Business Intelligence Analyst | 108668.5 | 3639 |
| 2 | Data Analyst | 108668.5 | 27832 |
| 3 | Data Quality Analyst | 108668.5 | 1070 |
| 4 | Enterprise Architect | 108668.5 | 8212 |
# Bubble chart: one marker per occupation, placed at its median salary and
# sized by the summed counter column.
bubble_options = dict(
    x="LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    y="Average Salary",
    size="counter",
    hover_name="LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    size_max=60,
)
fig2 = px.scatter(median_salaries2, **bubble_options)
# Label the bubble chart and display it explicitly, as is done for `fig`.
fig2.update_layout(
    xaxis_title="Occupation",
    yaxis_title="Median Salary",
    title="Bubble Chart of Jobs: Median Salary vs. Occupation (Bubble = # Postings)",
    xaxis_tickangle=45,  # rotate labels if long
)
fig2.show()

# Bucket each posting by its minimum education level. The first matching
# branch wins; anything unmatched (including nulls) becomes "Other".
# NOTE(review): isin()/== compare whole strings. The sample output shows
# values such as "Bachelor's degree" and "No Education Listed"; confirm that
# "GED", "Associate", "PhD", "Doctorate" and "professional degree" occur
# verbatim in MIN_EDULEVELS_NAME, otherwise those rows fall into "Other".
min_education = F.col("MIN_EDULEVELS_NAME")
df = df.withColumn(
    "education_group",
    F.when(min_education.isin("GED", "Associate", "No Education Listed"), "Associate's or lower")
    .when(min_education == "Bachelor's degree", "Bachelor's")
    .when(min_education.isin("Master's degree"), "Master's")
    .when(min_education.isin("PhD", "Doctorate", "professional degree"), "PhD")
    .otherwise("Other"),  # optional catch-all for unexpected values
)
df.show()+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+---------+--------------------+--------------------+-------------------+--------------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+--------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+------------------+------+--------------------+-----+--------------------+-----+-------------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+-------
----------------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+--------------+-------+--------------------+
| ID|LAST_UPDATED_DATE|LAST_UPDATED_TIMESTAMP|DUPLICATES| POSTED| EXPIRED|DURATION| SOURCE_TYPES| SOURCES| URL|ACTIVE_URLS|ACTIVE_SOURCES_INFO| TITLE_RAW| BODY|MODELED_EXPIRED|MODELED_DURATION| COMPANY| COMPANY_NAME| COMPANY_RAW|COMPANY_IS_STAFFING| EDUCATION_LEVELS|EDUCATION_LEVELS_NAME|MIN_EDULEVELS| MIN_EDULEVELS_NAME|MAX_EDULEVELS|MAX_EDULEVELS_NAME|EMPLOYMENT_TYPE|EMPLOYMENT_TYPE_NAME|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|IS_INTERNSHIP| SALARY|REMOTE_TYPE|REMOTE_TYPE_NAME|ORIGINAL_PAY_PERIOD|SALARY_TO|SALARY_FROM| LOCATION| CITY| CITY_NAME|COUNTY| COUNTY_NAME| MSA| MSA_NAME|STATE| STATE_NAME|COUNTY_OUTGOING|COUNTY_NAME_OUTGOING|COUNTY_INCOMING|COUNTY_NAME_INCOMING|MSA_OUTGOING| MSA_NAME_OUTGOING|MSA_INCOMING| MSA_NAME_INCOMING|NAICS2| NAICS2_NAME|NAICS3| NAICS3_NAME|NAICS4| NAICS4_NAME|NAICS5| NAICS5_NAME|NAICS6| NAICS6_NAME| TITLE| TITLE_NAME| TITLE_CLEAN| SKILLS| SKILLS_NAME| SPECIALIZED_SKILLS|SPECIALIZED_SKILLS_NAME| CERTIFICATIONS| CERTIFICATIONS_NAME| COMMON_SKILLS| COMMON_SKILLS_NAME| SOFTWARE_SKILLS|SOFTWARE_SKILLS_NAME| ONET| ONET_NAME| ONET_2019| ONET_2019_NAME| CIP6| CIP6_NAME| CIP4| CIP4_NAME| CIP2| CIP2_NAME|SOC_2021_2| SOC_2021_2_NAME|SOC_2021_3| SOC_2021_3_NAME|SOC_2021_4|SOC_2021_4_NAME|SOC_2021_5|SOC_2021_5_NAME|LOT_CAREER_AREA|LOT_CAREER_AREA_NAME|LOT_OCCUPATION| LOT_OCCUPATION_NAME|LOT_SPECIALIZED_OCCUPATION|LOT_SPECIALIZED_OCCUPATION_NAME|LOT_OCCUPATION_GROUP|LOT_OCCUPATION_GROUP_NAME|LOT_V6_SPECIALIZED_OCCUPATION|LOT_V6_SPECIALIZED_OCCUPATION_NAME|LOT_V6_OCCUPATION|LOT_V6_OCCUPATION_NAME|LOT_V6_OCCUPATION_GROUP|LOT_V6_OCCUPATION_GROUP_NAME|LOT_V6_CAREER_AREA|LOT_V6_CAREER_AREA_NAME| SOC_2| SOC_2_NAME| SOC_3| SOC_3_NAME| SOC_4| SOC_4_NAME| SOC_5| SOC_5_NAME|LIGHTCAST_SECTORS|LIGHTCAST_SECTORS_NAME|NAICS_2022_2| NAICS_2022_2_NAME|NAICS_2022_3| NAICS_2022_3_NAME|NAICS_2022_4| NAICS_2022_4_NAME|NAICS_2022_5| NAICS_2022_5_NAME|NAICS_2022_6| NAICS_2022_6_NAME|Average Salary|counter| education_group|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+---------+--------------------+--------------------+-------------------+--------------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+--------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+------------------+------+--------------------+-----+--------------------+-----+-------------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+----------------
-------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+--------------+-------+--------------------+
|1f57d95acf4dc67ed...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024| 6/8/2024| 6| [\n "Company"\n]|[\n "brassring.c...|[\n "https://sjo...| []| NULL|Enterprise Analys...|31-May-2024\n\nEn...| 6/8/2024| 6| 894731| Murphy USA| Murphy USA| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 2.0| 2.0| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 33.20...|RWwgRG9yYWRvLCBBUg==| El Dorado, AR| 5139| Union, AR|20980| El Dorado, AR| 5| Arkansas| 5139| Union, AR| 5139| Union, AR| 20980| El Dorado, AR| 20980| El Dorado, AR| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...|441330|Automotive Parts ...|ET29C073C03D1F86B4| Enterprise Analysts|enterprise analys...|[\n "KS126DB6T06...|[\n "Merchandisi...|[\n "KS126DB6T06...| [\n "Merchandisi...| []| []|[\n "KS126706DPF...|[\n "Mathematics...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "45.0601",\n...|[\n "Economics, ...|[\n "45.06",\n ...|[\n "Economics",...|[\n "45",\n "27...|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 44| Retail Trade| 441|Motor Vehicle and...| 4413|Automotive Parts,...| 44133|Automotive Parts ...| 441330|Automotive Parts ...| 108668.5| 1| Bachelor's|
|0cb072af26757b6c4...| 8/2/2024| 2024-08-02 17:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Job Board"\n]| [\n "maine.gov"\n]|[\n "https://job...| []| NULL|Oracle Consultant...|Oracle Consultant...| 8/1/2024| NULL| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3.0| 3.0| false|115024.0| 1| Remote| NULL| 130042.0| 87295.0|{\n "lat": 44.31...| QXVndXN0YSwgTUU=| Augusta, ME| 23011| Kennebec, ME|12300|Augusta-Watervill...| 23| Maine| 23011| Kennebec, ME| 23011| Kennebec, ME| 12300|Augusta-Watervill...| 12300|Augusta-Watervill...| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...| 108668.5| 1|Associate's or lower|
|85318b12b3331fa49...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL| Data Analyst|Taking care of pe...| 6/10/2024| 8| 39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 5.0| NULL| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 32.77...| RGFsbGFzLCBUWA==| Dallas, TX| 48113| Dallas, TX|19100|Dallas-Fort Worth...| 48| Texas| 48113| Dallas, TX| 48113| Dallas, TX| 19100|Dallas-Fort Worth...| 19100|Dallas-Fort Worth...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting| 108668.5| 1| Bachelor's|
|1b5c3941e54a1889e...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024|7/20/2024| 48| [\n "Job Board"\n]|[\n "disabledper...|[\n "https://www...| []| NULL|Sr. Lead Data Mgm...|About this role:\...| 6/12/2024| 10| 37615159| Wells Fargo| Wells Fargo| false| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3.0| NULL| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 33.44...| UGhvZW5peCwgQVo=| Phoenix, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking|522110| Commercial Banking|ET2114E0404BA30075| Management Analysts|sr lead data mgmt...|[\n "KS123QX62QY...|[\n "Exit Strate...|[\n "KS123QX62QY...| [\n "Exit Strate...| []| []|[\n "KS7G6NP6R6L...|[\n "Reliability...|[\n "KS4409D76NW...|[\n "SAS (Softwa...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 52|Finance and Insur...| 522|Credit Intermedia...| 5221|Depository Credit...| 52211| Commercial Banking| 522110| Commercial Banking| 108668.5| 1|Associate's or lower|
|cb5ca25f02bdf25c1...| 6/19/2024| 2024-06-19 07:00:00| 0|6/2/2024|6/17/2024| 15|[\n "FreeJobBoar...|[\n "craigslist....|[\n "https://mod...| []| NULL|Comisiones de $10...|Comisiones de $10...| 6/17/2024| 15| 0| Unclassified| LH/GM| false| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 3|Part-time / full-...| NULL| NULL| false| 92500.0| 0| [None]| year| 150000.0| 35000.0|{\n "lat": 37.63...| TW9kZXN0bywgQ0E=| Modesto, CA| 6099| Stanislaus, CA|33700| Modesto, CA| 6| California| 6099| Stanislaus, CA| 6099| Stanislaus, CA| 33700| Modesto, CA| 33700| Modesto, CA| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET0000000000000000| Unclassified|comisiones de por...| []| []| []| []| []| []| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...| 92500.0| 1|Associate's or lower|
|35a6cd2183d9fb270...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024|6/12/2024| 10| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL|SR Lead Data Analyst|About Lumen\n\nLu...| 6/12/2024| 10| 2233642| Lumen Technologies| Lumen| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| NULL| NULL| false|110155.0| 1| Remote| year| 125890.0| 94420.0|{\n "lat": 0,\n ...|W1Vua25vd24gQ2l0e...|[Unknown City], AR| 5999|[Unknown county], AR| NULL| NULL| 5| Arkansas| 5999|[Unknown county], AR| 5999|[Unknown county], AR| NULL| NULL| NULL| NULL| 51| Information| 517| Telecommunications| 5178|All Other Telecom...| 51781|All Other Telecom...|517810|All Other Telecom...|ET95DB859B53CCACA7| Lead Data Analysts|sr lead data analyst|[\n "KS13USA80NE...|[\n "Power BI",\...|[\n "KS13USA80NE...| [\n "Power BI",\...| []| []|[\n "KS1280B68GD...|[\n "Presentatio...|[\n "KS13USA80NE...|[\n "Power BI",\...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| [\n "52.0201"\n]|[\n "Business Ad...| [\n "52.02"\n]|[\n "Business Ad...| [\n "52"\n]|[\n "Business, M...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 51| Information| 517| Telecommunications| 5178|All Other Telecom...| 51781|All Other Telecom...| 517810|All Other Telecom...| 110155.0| 1| Bachelor's|
|06de8d192f30b1d8d...| 8/2/2024| 2024-08-02 17:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Company"\n]|[\n "oraclecloud...|[\n "https://hct...| []| NULL| Talent Data Analyst|Id : 2501314,\nTi...| 6/22/2024| 20| 44896740|Semiconductor Com...|Semiconductor Com...| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| NULL| NULL| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 33.49...|U2NvdHRzZGFsZSwgQVo=| Scottsdale, AZ| 4013| Maricopa, AZ|38060|Phoenix-Mesa-Chan...| 4| Arizona| 4013| Maricopa, AZ| 4013| Maricopa, AZ| 38060|Phoenix-Mesa-Chan...| 38060|Phoenix-Mesa-Chan...| 31| Manufacturing| 334|Computer and Elec...| 3344|Semiconductor and...| 33441|Semiconductor and...|334413|Semiconductor and...|ETA9B609BE4E431E44| IT Data Analysts| talent data analyst|[\n "KS1250B78VW...|[\n "Interactive...|[\n "KS1250B78VW...| [\n "Interactive...| []| []|[\n "ESFA9982A2A...|[\n "Analytical ...|[\n "KS1250B78VW...|[\n "Interactive...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 31| Manufacturing| 334|Computer and Elec...| 3344|Semiconductor and...| 33441|Semiconductor and...| 334413|Semiconductor and...| 108668.5| 1| Bachelor's|
|3d589c9d84677ca94...| 9/6/2024| 2024-09-06 20:32:...| 1|6/2/2024| 7/7/2024| 35| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL| Data Analyst|Taking care of pe...| 6/10/2024| 8| 39063746| Sedgwick| Sedgwick| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 5.0| NULL| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 39.75...| RGF5dG9uLCBPSA==| Dayton, OH| 39113| Montgomery, OH|19430|Dayton-Kettering, OH| 39| Ohio| 39113| Montgomery, OH| 39113| Montgomery, OH| 19430|Dayton-Kettering, OH| 19430|Dayton-Kettering-...| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...|524291| Claims Adjusting|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS1218W78FG...|[\n "Management"...|[\n "ESF3939CE1F...| [\n "Exception R...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS126HY6YLT...|[\n "Microsoft O...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 524|Insurance Carrier...| 5242|Agencies, Brokera...| 52429|Other Insurance R...| 524291| Claims Adjusting| 108668.5| 1| Bachelor's|
|5a843df632e1ff756...| 6/21/2024| 2024-06-21 07:00:00| 0|6/2/2024|6/20/2024| 18| [\n "Job Board"\n]|[\n "computerwor...|[\n "http://comp...| []| NULL|SAP SD/OTC Consul...|SAP SD/OTC Consul...| 6/20/2024| 18|100173263|Global Enterprise...|Global Enterprise...| true| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 7.0| 7.0| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 41.12...| RnJhbmtsaW4sIE5K| Franklin, NJ| 34037| Sussex, NJ|35620|New York-Newark-J...| 34| New Jersey| 34037| Sussex, NJ| 34037| Sussex, NJ| 35620|New York-Newark-J...| 35620|New York-Newark-J...| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET6244BCEEC5921581| SAP OTC Consultants|sap sd otc consul...|[\n "KS1200771D9...|[\n "JavaScript ...|[\n "KS1200771D9...| [\n "JavaScript ...| []| []| []| []|[\n "KS1200771D9...|[\n "JavaScript ...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...| 108668.5| 1|Associate's or lower|
|229620073766234e8...| 10/9/2024| 2024-10-09 18:07:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Company"\n]| [\n "3ds.com"\n]|[\n "https://www...| []| NULL|Sr. Marketing Ana...|Sr. Marketing Ana...| 8/1/2024| NULL| 39016169| Dassault Systèmes| Dassault Systmes| false| [\n 2,\n 3\n]| [ "Bachelor's de...| 2| Bachelor's degree| 3| Master's degree| 1|Full-time (> 32 h...| 2.0| 2.0| false| 92962.0| 0| [None]| year| 106424.0| 79500.0|{\n "lat": 40.75...| TmV3IFlvcmssIE5Z| New York, NY| 36061| New York, NY|35620|New York-Newark-J...| 36| New York| 36061| New York, NY| 36061| New York, NY| 35620|New York-Newark-J...| 35620|New York-Newark-J...| 54|Professional, Sci...| 541|Professional, Sci...| 5415|Computer Systems ...| 54151|Computer Systems ...|541511|Custom Computer P...|ET1CE3CFA5447376E9| Marketing Analysts|sr marketing analyst|[\n "KS4407N6CMT...|[\n "Salesforce"...|[\n "KS4407N6CMT...| [\n "Salesforce"...| []| []|[\n "KS7G747655V...|[\n "Prioritizat...|[\n "KS4407N6CMT...|[\n "Salesforce"...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "52.0101",\n...|[\n "Business/Co...|[\n "52.01",\n ...|[\n "Business/Co...|[\n "52",\n "45...|[\n "Business, M...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 7\n]| [\n "Artificial ...| 54|Professional, Sci...| 541|Professional, Sci...| 5415|Computer Systems ...| 54151|Computer Systems ...| 541511|Custom Computer P...| 92962.0| 1| Bachelor's|
|b7aa80a24c82f080c...| 9/28/2024| 2024-09-28 14:06:...| 8|6/2/2024|9/27/2024| NULL|[\n "Government"...|[\n "dcscorp.com...|[\n "https://www...| []| NULL| Data Analyst|Data Analyst In R...| 7/13/2024| 41| 12147696| DCS Corporation| DCS Corp.| false|[\n 0,\n 1,\n ...| [ "High school o...| 0| High school or GED| 2| Bachelor's degree| 1|Full-time (> 32 h...| 10.0| NULL| false|107645.0| 2| Not Remote| year| 123732.0| 91559.0|{\n "lat": 35.62...|UmlkZ2VjcmVzdCwgQ0E=| Ridgecrest, CA| 6029| Kern, CA|12540| Bakersfield, CA| 6| California| 6029| Kern, CA| 6029| Kern, CA| 12540| Bakersfield, CA| 12540|Bakersfield-Delan...| 42| Wholesale Trade| 423|Merchant Wholesal...| 4238|Machinery, Equipm...| 42383|Industrial Machin...|423830|Industrial Machin...|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS128HD6KJS...|[\n "Regression ...|[\n "KS128HD6KJS...| [\n "Regression ...|[\n "KS683TN76T7...|[\n "Security Cl...|[\n "KS1203C6N9B...|[\n "Research",\...|[\n "KS125LS6N7W...|[\n "Python (Pro...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "14.0101",\n...|[\n "Engineering...|[\n "14.01",\n ...|[\n "Engineering...|[\n "14",\n "14...|[\n "Engineering...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 42| Wholesale Trade| 423|Merchant Wholesal...| 4238|Machinery, Equipm...| 42383|Industrial Machin...| 423830|Industrial Machin...| 107645.5| 1| Other|
|2a107fd40bb1afac4...| 6/17/2024| 2024-06-17 07:00:00| 0|6/2/2024| 6/8/2024| 6| [\n "Job Board"\n]| [\n "dice.com"\n]|[\n "https://www...| []| NULL| Data Analyst|Data Analyst\nTEK...| 6/8/2024| 6| 4063994| Allegis Group|TEKsystems c/o Al...| true| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 2.0| NULL| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 21.30...| SG9ub2x1bHUsIEhJ| Honolulu, HI| 15003| Honolulu, HI|46520| Urban Honolulu, HI| 15| Hawaii| 15003| Honolulu, HI| 15003| Honolulu, HI| 46520| Urban Honolulu, HI| 46520| Urban Honolulu, HI| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET3037E0C947A02404| Data Analysts| data analyst|[\n "KS7LO8P3MXB...|[\n "Data Scienc...|[\n "KS7LO8P3MXB...| [\n "Data Scienc...| []| []|[\n "KS122556LMQ...|[\n "Communicati...|[\n "KS440W865GC...|[\n "SQL (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...|[\n "11.0701",\n...|[\n "Computer Sc...|[\n "11.07",\n ...|[\n "Computer Sc...|[\n "11",\n "30...|[\n "Computer an...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...| 108668.5| 1|Associate's or lower|
|fd48c3ce533c3d20a...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024| 7/5/2024| 33| [\n "Job Board"\n]|[\n "dejobs.org"\n]|[\n "https://dej...| []| NULL|Data Research Ana...|The Data Research...| 7/5/2024| 33| 34294036| Equifax| Equifax, Inc.| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| NULL| NULL| false|115024.0| 0| [None]| NULL| 130042.0| 87295.0|{\n "lat": 0,\n ...|W1Vua25vd24gQ2l0e...|[Unknown City], GA| 13999|[Unknown county], GA| NULL| NULL| 13| Georgia| 13999|[Unknown county], GA| 13999|[Unknown county], GA| NULL| NULL| NULL| NULL| 52|Finance and Insur...| 522|Credit Intermedia...| 5223|Activities Relate...| 52232|Financial Transac...|522320|Financial Transac...|ET252B42EF548117CC| Data Researchers|data research ana...|[\n "KS120GV6C72...|[\n "Data Analys...|[\n "KS120GV6C72...| [\n "Data Analys...| []| []|[\n "KS1203C6N9B...|[\n "Research",\...| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 522|Credit Intermedia...| 5223|Activities Relate...| 52232|Financial Transac...| 522320|Financial Transac...| 108668.5| 1| Bachelor's|
|57b527ea0f91db5bb...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024|7/27/2024| 55| [\n "Job Board"\n]|[\n "simplyhired...|[\n "https://www...| []| NULL|Power, Utilities ...|Power, Utilities ...| 7/27/2024| 55| 5732448| Deloitte| Deloitte| false| [\n 2,\n 3\n]| [ "Bachelor's de...| 2| Bachelor's degree| 3| Master's degree| 1|Full-time (> 32 h...| 6.0| NULL| false|192800.0| 0| [None]| year| 241000.0| 144600.0|{\n "lat": 42.33...| RGV0cm9pdCwgTUk=| Detroit, MI| 26163| Wayne, MI|19820|Detroit-Warren-De...| 26| Michigan| 26163| Wayne, MI| 26163| Wayne, MI| 19820|Detroit-Warren-De...| 19820|Detroit-Warren-De...| 54|Professional, Sci...| 541|Professional, Sci...| 5416|Management, Scien...| 54161|Management Consul...|541611|Administrative Ma...|ET8AEDEB1F4C3091D3|Management Consul...|power utilities r...|[\n "KS122VL71WF...|[\n "Design Spec...|[\n "KS122VL71WF...| [\n "Design Spec...| []| []|[\n "KS1218W78FG...|[\n "Management"...|[\n "KS1219W70LY...|[\n "C++ (Progra...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| [\n "45.0702"\n]|[\n "Geographic ...| [\n "45.07"\n]|[\n "Geography a...| [\n "45"\n]|[\n "Social Scie...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 3\n]| [\n "Green Jobs:...| 54|Professional, Sci...| 541|Professional, Sci...| 5416|Management, Scien...| 54161|Management Consul...| 541611|Administrative Ma...| 192800.0| 1| Bachelor's|
|036cd733481fbcc98...| 8/2/2024| 2024-08-02 17:08:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Job Board"\n]| [\n "ms.gov"\n]|[\n "https://win...| []| NULL|Sr. Enterprise Da...|Sr. Enterprise Da...| 6/14/2024| 12| 38205299|Lincoln Financial...|Lincoln Financial...| false| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| NULL| NULL| false| 81286.0| 1| Remote| year| 81286.0| 81286.0|{\n "lat": 32.29...| SmFja3NvbiwgTVM=| Jackson, MS| 28049| Hinds, MS|27140| Jackson, MS| 28| Mississippi| 28049| Hinds, MS| 28049| Hinds, MS| 27140| Jackson, MS| 27140| Jackson, MS| 52|Finance and Insur...| 523|Securities, Commo...| 5239|Other Financial I...| 52394|Portfolio Managem...|523940|Portfolio Managem...|ET0000000000000000| Unclassified|sr enterprise dat...|[\n "KS122NM6B8T...|[\n "Data Archit...|[\n "KS122NM6B8T...| [\n "Data Archit...|[\n "ESE495A4017...|[\n "Valid Drive...| []| []| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231510|Computer Systems ...| 23151012| Enterprise Architect| 2315| Network and Syste...| 23151012| Enterprise Architect| 231510| Computer Systems ...| 2315| Network and Syste...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 52|Finance and Insur...| 523|Securities, Commo...| 5239|Other Financial I...| 52394|Portfolio Managem...| 523940|Portfolio Managem...| 81286.0| 1|Associate's or lower|
|138ce2c9453b47a9b...| 8/10/2024| 2024-08-10 19:36:...| 5|6/2/2024| 8/9/2024| NULL|[\n "Job Board",...|[\n "silkroad.co...|[\n "https://mai...| []| NULL|SENIOR CONSULTANT...|SENIOR CONSULTANT...| 6/8/2024| 6| 1967| Boston University| Boston University| false|[\n 1,\n 2,\n ...| [ "Associate deg...| 1| Associate degree| 3| Master's degree| 1|Full-time (> 32 h...| 5.0| 5.0| false|115024.0| 1| Remote| NULL| 130042.0| 87295.0|{\n "lat": 42.36...| Qm9zdG9uLCBNQQ==| Boston, MA| 25025| Suffolk, MA|14460|Boston-Cambridge-...| 25|Massachusetts| 25025| Suffolk, MA| 25025| Suffolk, MA| 14460|Boston-Cambridge-...| 14460|Boston-Cambridge-...| 61|Educational Services| 611|Educational Services| 6113|Colleges, Univers...| 61131|Colleges, Univers...|611310|Colleges, Univers...|ET210B837B93B7B3F9|Continuous Improv...|senior consultant...|[\n "ESB38820A54...|[\n "Effective C...|[\n "ESB38820A54...| [\n "Effective C...|[\n "KS7G2ZG794H...|[\n "Certified I...|[\n "KS1280B68GD...|[\n "Presentatio...| []| []|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| [\n "52.0201"\n]|[\n "Business Ad...| [\n "52.02"\n]|[\n "Business Ad...| [\n "52"\n]|[\n "Business, M...| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 61|Educational Services| 611|Educational Services| 6113|Colleges, Univers...| 61131|Colleges, Univers...| 611310|Colleges, Univers...| 108668.5| 1| Other|
|dd191e2ce3062c371...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024|6/20/2024| 18| [\n "Job Board"\n]|[\n "phoenixrecr...|[\n "https://www...| []| NULL| SAP FSCM Consultant|Job Description: ...| 6/20/2024| 18| 8592955| Accenture| Accenture| false| [\n 1,\n 2\n]| [ "Associate deg...| 1| Associate degree| 2| Bachelor's degree| 1|Full-time (> 32 h...| 12.0| NULL| false|125900.0| 0| [None]| year| 188600.0| 63200.0|{\n "lat": 0,\n ...|W1Vua25vd24gQ2l0e...|[Unknown City], AZ| 4999|[Unknown county], AZ| NULL| NULL| 4| Arizona| 4999|[Unknown county], AZ| 4999|[Unknown county], AZ| NULL| NULL| NULL| NULL| 54|Professional, Sci...| 541|Professional, Sci...| 5415|Computer Systems ...| 54151|Computer Systems ...|541512|Computer Systems ...|ETF594A2C05D212506|Peoplesoft FSCM C...| sap fscm consultant|[\n "KS7G7VL78R2...|[\n "Profit Cent...|[\n "KS7G7VL78R2...| [\n "Profit Cent...| []| []|[\n "KS122ZF75YV...|[\n "Digitizatio...|[\n "KS7G7VL78R2...|[\n "Profit Cent...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101011| General ERP Analy...| 2310| Business Intellig...| 23101011| General ERP Analy...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 54|Professional, Sci...| 541|Professional, Sci...| 5415|Computer Systems ...| 54151|Computer Systems ...| 541512|Computer Systems ...| 125900.0| 1| Other|
|99856b5a8a1c75d90...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024| 8/1/2024| NULL|[\n "Government"\n]|[\n "alaska.gov"\n]|[\n "https://ala...| []| NULL|Oracle Consultant...|Onsite - Work ons...| 7/10/2024| 38| 133098|Smx Corporation L...| SMX| true| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 3.0| 3.0| false|115024.0| 1| Remote| NULL| 130042.0| 87295.0|{\n "lat": 58.30...| SnVuZWF1LCBBSw==| Juneau, AK| 2110| Juneau Borough, AK|27940| Juneau, AK| 2| Alaska| 2110| Juneau Borough, AK| 2110| Juneau Borough, AK| 27940| Juneau, AK| 27940| Juneau, AK| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...|561320|Temporary Help Se...|ET21DDA63780A7DC09| Oracle Consultants|oracle consultant...|[\n "KS122626T55...|[\n "Procurement...|[\n "KS122626T55...| [\n "Procurement...| []| []| []| []|[\n "BGSBF3F508F...|[\n "Oracle Busi...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231010|Business Intellig...| 23101012| Oracle Consultant...| 2310| Business Intellig...| 23101012| Oracle Consultant...| 231010| Business Intellig...| 2310| Business Intellig...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| NULL| NULL| 56|Administrative an...| 561|Administrative an...| 5613| Employment Services| 56132|Temporary Help Se...| 561320|Temporary Help Se...| 108668.5| 1|Associate's or lower|
|f28123528a32b8c9b...| 9/6/2024| 2024-09-06 20:32:...| 0|6/2/2024| 8/1/2024| NULL| [\n "Company"\n]|[\n "sca.health"\n]|[\n "https://car...| []| NULL| Principal Architect|Principal Archite...| 8/1/2024| NULL| 39192167|Surgical Care Aff...|Surgical Care Aff...| false| [\n 2\n]| [ "Bachelor's de...| 2| Bachelor's degree| NULL| NULL| 1|Full-time (> 32 h...| 8.0| 8.0| false|115024.0| 0| [None]| year| 170000.0| 160000.0|{\n "lat": 33.51...|QmlybWluZ2hhbSwgQUw=| Birmingham, AL| 1073| Jefferson, AL|13820|Birmingham-Hoover...| 1| Alabama| 1073| Jefferson, AL| 1073| Jefferson, AL| 13820|Birmingham-Hoover...| 13820| Birmingham, AL| 62|Health Care and S...| 621|Ambulatory Health...| 6214|Outpatient Care C...| 62149|Other Outpatient ...|621493|Freestanding Ambu...|ET7767EEDBF263F7B7|Principal Architects| principal architect|[\n "ES99B020D66...|[\n "Business Ob...|[\n "ES4B99FD0FD...| [\n "Infrastruct...|[\n "KS125K065BR...|[\n "Juniper Net...|[\n "ES99B020D66...|[\n "Business Ob...|[\n "KS120V86MZW...|[\n "Microsoft A...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231510|Computer Systems ...| 23151012| Enterprise Architect| 2315| Network and Syste...| 23151012| Enterprise Architect| 231510| Computer Systems ...| 2315| Network and Syste...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 5\n]| [\n "Cybersecuri...| 62|Health Care and S...| 621|Ambulatory Health...| 6214|Outpatient Care C...| 62149|Other Outpatient ...| 621493|Freestanding Ambu...| 165000.0| 1| Bachelor's|
|b4e618e8d2a2b6744...| 10/9/2024| 2024-10-09 18:07:...| 2|6/2/2024|8/11/2024| NULL| [\n "Job Board"\n]|[\n "castrovalle...|[\n "https://www...| []| NULL|Principal growth ...|Principal growth ...| 7/27/2024| 55| 40794223|Aircall Internati...| Aircall| false| [\n 99\n]| [ "No Education ...| 99|No Education Listed| NULL| NULL| 1|Full-time (> 32 h...| 6.0| NULL| false|170000.0| 0| [None]| year| 220000.0| 120000.0|{\n "lat": 37.77...|U2FuIEZyYW5jaXNjb...| San Francisco, CA| 6075| San Francisco, CA|41860|San Francisco-Oak...| 6| California| 6075| San Francisco, CA| 6075| San Francisco, CA| 41860|San Francisco-Oak...| 41860|San Francisco-Oak...| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...|999999|Unclassified Indu...|ET54F46C4290228B21| Growth Analysts|principal growth ...|[\n "ESA420F05EB...|[\n "Curiosity",...|[\n "KS1218H6QYL...| [\n "Business Co...| []| []|[\n "ESA420F05EB...|[\n "Curiosity",...|[\n "KS1200364C9...|[\n "C (Programm...|15-2051.01|Business Intellig...|15-2051.01|Business Intellig...| []| []| []| []| []| []| 15-0000|Computer and Math...| 15-2000|Mathematical Scie...| 15-2050|Data Scientists| 15-2051|Data Scientists| 23|Information Techn...| 231113|Data / Data Minin...| 23111310| Data Analyst| 2311| Data Analysis and...| 23111310| Data Analyst| 231113| Data / Data Minin...| 2311| Data Analysis and...| 23| Information Techn...|15-0000|Computer and Math...|15-2000|Mathematical Scie...|15-2050|Data Scientists|15-2051|Data Scientists| [\n 6\n]| [\n "Data Privac...| 99|Unclassified Indu...| 999|Unclassified Indu...| 9999|Unclassified Indu...| 99999|Unclassified Indu...| 999999|Unclassified Indu...| 170000.0| 1|Associate's or lower|
+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+---------+--------------------+--------------------+-------------------+--------------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+--------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+------------------+------+--------------------+-----+--------------------+-----+-------------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+----------+--------------------+----------+---------------+----------+---------------+---------------+--------------------+--------------+--------------------+--------------------------+-------------------------------+--------------------+-------------------------+-----------------------------+----------------------------------+-----------------+----------------------+-----------------------+----------------------------+------------------+----------------
-------+-------+--------------------+-------+--------------------+-------+---------------+-------+---------------+-----------------+----------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+------------+--------------------+--------------+-------+--------------------+
only showing top 20 rows
#parse
# Columns to keep: occupation name, average salary, and maximum years of
# experience.
export_cols3 = [
    "LOT_V6_SPECIALIZED_OCCUPATION_NAME",
    "Average Salary",
    "MAX_YEARS_EXPERIENCE",
]
# Project the Spark DataFrame down to just those columns.
df_selected3 = df.select(export_cols3)
df_selected3.show(10)
+----------------------------------+--------------+--------------------+
|LOT_V6_SPECIALIZED_OCCUPATION_NAME|Average Salary|MAX_YEARS_EXPERIENCE|
+----------------------------------+--------------+--------------------+
| General ERP Analy...| 108668.5| 2.0|
| Oracle Consultant...| 108668.5| 3.0|
| Data Analyst| 108668.5| NULL|
| Data Analyst| 108668.5| NULL|
| Oracle Consultant...| 92500.0| NULL|
| Data Analyst| 110155.0| NULL|
| Data Analyst| 108668.5| NULL|
| Data Analyst| 108668.5| NULL|
| General ERP Analy...| 108668.5| 7.0|
| Data Analyst| 92962.0| 2.0|
+----------------------------------+--------------+--------------------+
only showing top 10 rows
# Convert the selected Spark columns into a pandas DataFrame.
pdf3 = df_selected3.toPandas()
# Disabled CSV export and row-count check. NOTE(review): these still refer to
# `pdf2`, not `pdf3` -- update the name before re-enabling them.
#pdf2.to_csv("./data/lighthouse_cleaned.csv", index=False)
#print(len(pdf2))
pdf3.head(10)
[Stage 25:> (0 + 1) / 1]
| | LOT_V6_SPECIALIZED_OCCUPATION_NAME | Average Salary | MAX_YEARS_EXPERIENCE |
|---|---|---|---|
| 0 | General ERP Analyst / Consultant | 108668.5 | 2.0 |
| 1 | Oracle Consultant / Analyst | 108668.5 | 3.0 |
| 2 | Data Analyst | 108668.5 | NaN |
| 3 | Data Analyst | 108668.5 | NaN |
| 4 | Oracle Consultant / Analyst | 92500.0 | NaN |
| 5 | Data Analyst | 110155.0 | NaN |
| 6 | Data Analyst | 108668.5 | NaN |
| 7 | Data Analyst | 108668.5 | NaN |
| 8 | General ERP Analyst / Consultant | 108668.5 | 7.0 |
| 9 | Data Analyst | 92962.0 | 2.0 |
# Replace missing maximum-experience values with 0, in place, leaving the
# other columns untouched.
pdf3.fillna({"MAX_YEARS_EXPERIENCE": 0}, inplace=True)
pdf3.head(10)
| | LOT_V6_SPECIALIZED_OCCUPATION_NAME | Average Salary | MAX_YEARS_EXPERIENCE |
|---|---|---|---|
| 0 | General ERP Analyst / Consultant | 108668.5 | 2.0 |
| 1 | Oracle Consultant / Analyst | 108668.5 | 3.0 |
| 2 | Data Analyst | 108668.5 | 0.0 |
| 3 | Data Analyst | 108668.5 | 0.0 |
| 4 | Oracle Consultant / Analyst | 92500.0 | 0.0 |
| 5 | Data Analyst | 110155.0 | 0.0 |
| 6 | Data Analyst | 108668.5 | 0.0 |
| 7 | Data Analyst | 108668.5 | 0.0 |
| 8 | General ERP Analyst / Consultant | 108668.5 | 7.0 |
| 9 | Data Analyst | 92962.0 | 2.0 |
import matplotlib.pyplot as plt

# Jittered scatter of average salary against maximum years of experience,
# drawn as one series per occupation so each gets its own legend entry.
groups = pdf3["LOT_V6_SPECIALIZED_OCCUPATION_NAME"].unique()
plt.figure(figsize=(10, 6))
for g in groups:
    subset = pdf3[pdf3["LOT_V6_SPECIALIZED_OCCUPATION_NAME"] == g]
    # Add jitter to avoid overlapping points
    x_jitter = subset["MAX_YEARS_EXPERIENCE"] + np.random.normal(0, 0.2, size=len(subset))
    # NOTE(review): a 0.2 standard deviation is negligible next to salary
    # values in the 1e5 range, so this barely separates points vertically --
    # confirm whether a larger scale was intended.
    y_jitter = subset["Average Salary"] + np.random.normal(0, 0.2, size=len(subset))
    plt.scatter(x_jitter, y_jitter, alpha=0.6, label=g)
# Axis labels, title and legend are set once, after every group is drawn.
plt.xlabel("X-axis (with jitter)")
plt.ylabel("Y-axis (with jitter)")
plt.title("Scatter Plots per Group with Jitter")
plt.legend()
plt.show()